From 8a181133e2b48289ff83f5efdb4bdcb289346413 Mon Sep 17 00:00:00 2001 From: "kaf24@firebug.cl.cam.ac.uk" Date: Fri, 26 Aug 2005 11:02:14 +0000 Subject: [PATCH] The patch attached enables x86_64 xenlinux with "late pin, early unpin", which is already implemented for x86_32. Since we now only pin the root rather than any of the other levels, the overall performance became better especially with workloads that require heavy memory management operations. On 8-way x86_64 xenlinux (dom0) the kernel build was improved by about 10% (using make -j32). Even a small setup like a UP HT system, I see about 3% performance gain with kernel build (make -j4). Lmbench also shows improvements in fork/exec/sh: Processor, Processes - times in microseconds - smaller is better -------------------------------------------------------------------- Host OS Mhz null null open slct sig sig fork exec sh call I/O stat clos TCP inst hndl proc proc proc --------- ------------- ---- ---- ---- ---- ---- ---- ---- ---- ---- Linux 2.6.12- 3786 1.13 1.36 3.93 6.04 10.5 1.43 4.33 536. 1446 3614 Linux 2.6.12- 3786 1.13 1.36 3.91 6.03 10.4 1.44 4.38 346. 
1050 2831 Signed-off-by: Jun Nakajima --- .../arch/xen/x86_64/kernel/ldt.c | 12 +- .../arch/xen/x86_64/mm/init.c | 1 + .../arch/xen/x86_64/mm/pageattr.c | 150 ++++++++++++++++-- .../include/asm-xen/asm-x86_64/mmu.h | 33 ++++ .../include/asm-xen/asm-x86_64/mmu_context.h | 6 + .../include/asm-xen/asm-x86_64/pgalloc.h | 118 ++++++++------ .../include/asm-xen/asm-x86_64/tlbflush.h | 2 +- 7 files changed, 263 insertions(+), 59 deletions(-) create mode 100644 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu.h diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c index 855128a4b2..72bde0ec16 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c @@ -105,14 +105,19 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) struct mm_struct * old_mm; int retval = 0; + memset(&mm->context, 0, sizeof(mm->context)); init_MUTEX(&mm->context.sem); - mm->context.size = 0; old_mm = current->mm; if (old_mm && old_mm->context.size > 0) { down(&old_mm->context.sem); retval = copy_ldt(&mm->context, &old_mm->context); up(&old_mm->context.sem); } + if (retval == 0) { + spin_lock(&mm_unpinned_lock); + list_add(&mm->context.unpinned, &mm_unpinned); + spin_unlock(&mm_unpinned_lock); + } return retval; } @@ -134,6 +139,11 @@ void destroy_context(struct mm_struct *mm) kfree(mm->context.ldt); mm->context.size = 0; } + if (!mm->context.pinned) { + spin_lock(&mm_unpinned_lock); + list_del(&mm->context.unpinned); + spin_unlock(&mm_unpinned_lock); + } } static int read_ldt(void __user * ptr, unsigned long bytecount) diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c index 74ee49b2de..f9f7454036 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c @@ -712,6 +712,7 @@ void __init paging_init(void) HYPERVISOR_shared_info = (shared_info_t 
*)fix_to_virt(FIX_SHARED_INFO); memset(empty_zero_page, 0, sizeof(empty_zero_page)); + init_mm.context.pinned = 1; #ifdef CONFIG_XEN_PHYSDEV_ACCESS { diff --git a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c index 4afb9c26e4..454dc16f48 100644 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c @@ -12,20 +12,146 @@ #include #include #include -#include #include -void pte_free(struct page *pte) +#ifdef CONFIG_XEN +#include +#include + +LIST_HEAD(mm_unpinned); +DEFINE_SPINLOCK(mm_unpinned_lock); + +static inline void mm_walk_set_prot(void *pt, pgprot_t flags) +{ + struct page *page = virt_to_page(pt); + unsigned long pfn = page_to_pfn(page); + + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, flags), 0)); +} + +static void mm_walk(struct mm_struct *mm, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int g,u,m; + + pgd = mm->pgd; + for (g = 0; g <= USER_PTRS_PER_PGD; g++, pgd++) { + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + if (PTRS_PER_PUD > 1) /* not folded */ + mm_walk_set_prot(pud,flags); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + if (PTRS_PER_PMD > 1) /* not folded */ + mm_walk_set_prot(pmd,flags); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + if (pmd_none(*pmd)) + continue; + pte = pte_offset_kernel(pmd,0); + mm_walk_set_prot(pte,flags); + } + } + } +} + +void mm_pin(struct mm_struct *mm) +{ + spin_lock(&mm->page_table_lock); + + mm_walk(mm, PAGE_KERNEL_RO); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)mm->pgd, + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO), + UVMF_TLB_FLUSH)); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__user_pgd(mm->pgd), + pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL_RO), + UVMF_TLB_FLUSH)); + 
xen_pgd_pin(__pa(mm->pgd)); /* kernel */ + xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */ + mm->context.pinned = 1; + spin_lock(&mm_unpinned_lock); + list_del(&mm->context.unpinned); + spin_unlock(&mm_unpinned_lock); + + spin_unlock(&mm->page_table_lock); +} + +void mm_unpin(struct mm_struct *mm) { - pte_t *ptep; + spin_lock(&mm->page_table_lock); - ptep = pfn_to_kaddr(page_to_pfn(pte)); + xen_pgd_unpin(__pa(mm->pgd)); + xen_pgd_unpin(__pa(__user_pgd(mm->pgd))); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)mm->pgd, + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0)); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__user_pgd(mm->pgd), + pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL), 0)); + mm_walk(mm, PAGE_KERNEL); + xen_tlb_flush(); + mm->context.pinned = 0; + spin_lock(&mm_unpinned_lock); + list_add(&mm->context.unpinned, &mm_unpinned); + spin_unlock(&mm_unpinned_lock); - xen_pte_unpin(__pa(ptep)); - make_page_writable(ptep); - __free_page(pte); + spin_unlock(&mm->page_table_lock); } +void mm_pin_all(void) +{ + while (!list_empty(&mm_unpinned)) + mm_pin(list_entry(mm_unpinned.next, struct mm_struct, + context.unpinned)); +} + +void _arch_exit_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + + /* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() + * *much* faster this way, as no tlb flushes means bigger wrpt batches. + */ + if ( tsk->active_mm == mm ) + { + tsk->active_mm = &init_mm; + atomic_inc(&init_mm.mm_count); + + switch_mm(mm, &init_mm, tsk); + + atomic_dec(&mm->mm_count); + BUG_ON(atomic_read(&mm->mm_count) == 0); + } + + task_unlock(tsk); + + if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) ) + mm_unpin(mm); +} + +void pte_free(struct page *pte) +{ + unsigned long va = (unsigned long)__va(page_to_pfn(pte)< +#include + +/* + * The x86_64 doesn't have a mmu context, but + * we put the segment information here. 
+ * + * cpu_vm_mask is used to optimize ldt flushing. + */ +typedef struct { + void *ldt; + rwlock_t ldtlock; + int size; + struct semaphore sem; +#ifdef CONFIG_XEN + unsigned pinned:1; + struct list_head unpinned; +#endif +} mm_context_t; + +#ifdef CONFIG_XEN +extern struct list_head mm_unpinned; +extern spinlock_t mm_unpinned_lock; + +/* mm/memory.c:exit_mmap hook */ +extern void _arch_exit_mmap(struct mm_struct *mm); +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm) +#endif + +#endif diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h index 592bef5b7d..43512ae075 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h @@ -58,6 +58,9 @@ static inline void __prepare_arch_switch(void) } } +extern void mm_pin(struct mm_struct *mm); +extern void mm_unpin(struct mm_struct *mm); +void mm_pin_all(void); static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) @@ -66,6 +69,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct mmuext_op _op[3], *op = _op; if (likely(prev != next)) { + if (!next->context.pinned) + mm_pin(next); + /* stop flush ipis for the previous mm */ clear_bit(cpu, &prev->cpu_vm_mask); #if 0 /* XEN: no lazy tlb */ diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h index d6dad2dcce..47f69fb832 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h @@ -21,12 +21,27 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t * static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { - set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); + if (unlikely((mm)->context.pinned)) { + 
BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT), + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0)); + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); + } else { + *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)); + } } static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { - set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); + if (unlikely((mm)->context.pinned)) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pmd, + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, + PAGE_KERNEL_RO), 0)); + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); + } else { + *(pud) = __pud(_PAGE_TABLE | __pa(pmd)); + } } /* @@ -35,53 +50,54 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) */ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) { - set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); - set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud))); -} - -extern __inline__ pmd_t *get_pmd(void) -{ - pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); - if (!pmd) - return NULL; - make_page_readonly(pmd); - xen_pmd_pin(__pa(pmd)); - return pmd; + if (unlikely((mm)->context.pinned)) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pud, + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, + PAGE_KERNEL_RO), 0)); + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); + set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud))); + } else { + *(pgd) = __pgd(_PAGE_TABLE | __pa(pud)); + *(__user_pgd(pgd)) = *(pgd); + } } extern __inline__ void pmd_free(pmd_t *pmd) { - BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - xen_pmd_unpin(__pa(pmd)); - make_page_writable(pmd); + pte_t *ptep = virt_to_ptep(pmd); + + if (!pte_write(*ptep)) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pmd, + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, PAGE_KERNEL), + 0)); + } free_page((unsigned long)pmd); } static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { 
pmd_t *pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); - if (!pmd) - return NULL; - make_page_readonly(pmd); - xen_pmd_pin(__pa(pmd)); return pmd; } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { pud_t *pud = (pud_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); - if (!pud) - return NULL; - make_page_readonly(pud); - xen_pud_pin(__pa(pud)); return pud; } static inline void pud_free(pud_t *pud) { - BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - xen_pud_unpin(__pa(pud)); - make_page_writable(pud); + pte_t *ptep = virt_to_ptep(pud); + + if (!pte_write(*ptep)) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pud, + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, PAGE_KERNEL), + 0)); + } free_page((unsigned long)pud); } @@ -107,10 +123,6 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) (PTRS_PER_PGD - boundary) * sizeof(pgd_t)); memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */ - make_pages_readonly(pgd, 2); - - xen_pgd_pin(__pa(pgd)); /* kernel */ - xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */ /* * Set level3_user_pgt for vsyscall area */ @@ -121,31 +133,45 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) static inline void pgd_free(pgd_t *pgd) { - BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); - xen_pgd_unpin(__pa(pgd)); - xen_pgd_unpin(__pa(__user_pgd(pgd))); - make_pages_writable(pgd, 2); + pte_t *ptep = virt_to_ptep(pgd); + + if (!pte_write(*ptep)) { + xen_pgd_unpin(__pa(pgd)); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pgd, + pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL), + 0)); + } + + ptep = virt_to_ptep(__user_pgd(pgd)); + + if (!pte_write(*ptep)) { + xen_pgd_unpin(__pa(__user_pgd(pgd))); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__user_pgd(pgd), + pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT, + PAGE_KERNEL), + 0)); + } + free_pages((unsigned long)pgd, 1); } static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pte_t *pte = 
(pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); - if (!pte) - return NULL; - make_page_readonly(pte); - xen_pte_pin(__pa(pte)); + if (pte) + make_page_readonly(pte); + return pte; } static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - pte_t *pte = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); - if (!pte) - return NULL; - make_page_readonly(pte); - xen_pte_pin(__pa(pte)); - return virt_to_page((unsigned long)pte); + struct page *pte; + + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + return pte; } /* Should really implement gc for free page table pages. This could be diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h index 35fd9b530d..cc936335c3 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h @@ -18,7 +18,7 @@ extern unsigned long pgkern_mask; #define __flush_tlb_all() __flush_tlb_global() -#define __flush_tlb_one(addr) xen_invlpg(addr) +#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr) /* -- 2.30.2